#--------------------------------------------------------#
# Libraries #
#--------------------------------------------------------#
set.seed(2020)
#library(ranger)
#library(dplyr)
#library(caret)
library(tidymodels)
library(usemodels)
#--------------------------------------------------------#
# Data #
#--------------------------------------------------------#
FS1_T0_Cluster_Assignment <-read.csv("D:\\DKE\\Thesis_related\\Implementation\\Hyper_tuning_results\\FS1_T0_Cluster_Assignment.csv")
#Convert Cluster ID from int to factor
FS1_T0_Cluster_Assignment<- FS1_T0_Cluster_Assignment %>% mutate(Cluster_ID=as.factor(Cluster_ID))
#--------------------------------------------------------#
# (console output) There were 12 warnings (use warnings() to see them)
# Splitting Data #
#--------------------------------------------------------#
set.seed(123)
#ikea_split
FS1_T0_Cluster_split <- initial_split(FS1_T0_Cluster_Assignment, strata = Cluster_ID)
FS1_T0_Cluster_train <- training(FS1_T0_Cluster_split)
FS1_T0_Cluster_test <- testing(FS1_T0_Cluster_split)
set.seed(234)
FS1_T0_Cluster_folds <- bootstraps(FS1_T0_Cluster_train, strata = Cluster_ID)
FS1_T0_Cluster_folds
# Bootstrap sampling using stratification
#library(usemodels)
use_ranger(Cluster_ID ~ ., data = FS1_T0_Cluster_train)
ranger_recipe <-
recipe(formula = Cluster_ID ~ ., data = FS1_T0_Cluster_train)
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
set.seed(68322)
ranger_tune <-
tune_grid(ranger_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
#--------------------------------------------------------#
# Start tuning #
#--------------------------------------------------------#
ranger_recipe <-
recipe(formula = Cluster_ID ~ ., data = FS1_T0_Cluster_train) %>% update_role(Instance_ID, new_role = "ID")
ranger_spec <-
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_mode("classification") %>%
set_engine("ranger")
ranger_workflow <-
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(ranger_spec)
ranger_folds <- vfold_cv(FS1_T0_Cluster_train)
set.seed(68322)
ranger_tune <-
tune_grid(ranger_workflow, resamples = ranger_folds, grid = 20)
ranger_tune %>% collect_metrics()
#--------------------------------------------------------#
# (console output) There were 14 warnings (use warnings() to see them)
# graphical results #
#--------------------------------------------------------#
ranger_tune %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
mutate(min_n = factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n)) +
geom_line(alpha = 0.5, size = 1.5) +
geom_point() +
labs(y = "AUC")

# Specifying the mtry and min_n range to understand better
rf_grid_ranger <- grid_regular(
mtry(range = c(10, 30)),
min_n(range = c(2, 8)),
levels = 5
)
rf_grid_ranger
doParallel::registerDoParallel()
ranger_tune <-
tune_grid(ranger_workflow, resamples = ranger_folds, grid = rf_grid_ranger)
show_best(ranger_tune, metric = "roc_auc")
show_best(ranger_tune, metric = "accuracy")
select_best(ranger_tune,metric = "roc_auc")
autoplot(ranger_tune)

#--------------------------------------------------------#
# (console output) Warning messages:
# 1: In readChar(file, size, TRUE) : truncating string with embedded nuls
# 2: In readChar(file, size, TRUE) : truncating string with embedded nuls
# 3: In readChar(file, size, TRUE) : truncating string with embedded nuls
# graphical results #
#--------------------------------------------------------#
ranger_tune %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
mutate(min_n = factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n)) +
geom_line(alpha = 0.5, size = 1.5) +
geom_point() +
labs(y = "AUC")

#--------------------------------------------------------#
# (console output) Warning messages:
# 1: In readChar(file, size, TRUE) : truncating string with embedded nuls
# 2: In readChar(file, size, TRUE) : truncating string with embedded nuls
# 3: In readChar(file, size, TRUE) : truncating string with embedded nuls
# 4: In readChar(file, size, TRUE) : truncating string with embedded nuls
# Show and select the best #
#--------------------------------------------------------#
show_best(ranger_tune, metric = "roc_auc")
show_best(ranger_tune, metric = "accuracy")
select_best(ranger_tune,metric = "roc_auc")
autoplot(ranger_tune)

#--------------------------------------------------------#
# finalize the model and workflow #
#--------------------------------------------------------#
final_rf <- ranger_workflow %>%
finalize_workflow(select_best(ranger_tune))
# (console output) No value of `metric` was given; metric 'roc_auc' will be used.
final_rf
# (console output, ANSI escapes stripped)
# == Workflow ==============================================================================================================================
# Preprocessor: Recipe
# Model: rand_forest()
# -- Preprocessor --------------------------------------------------------------------------------------------------------------------------
# 0 Recipe Steps
# -- Model ---------------------------------------------------------------------------------------------------------------------------------
# Random Forest Model Specification (classification)
# Main Arguments:
#   mtry = 10
#   trees = 1000
#   min_n = 2
# Computational engine: ranger
#--------------------------------------------------------#
# fit the best tuned parameter on the training data #
#--------------------------------------------------------#
FS1_T0_Cluster_fit <- last_fit(final_rf, FS1_T0_Cluster_split)
FS1_T0_Cluster_fit
# Resampling results
# Manual resampling
collect_metrics(FS1_T0_Cluster_fit)
library(vip)
imp_spec <- ranger_spec %>%
finalize_model(select_best(ranger_tune)) %>%
set_engine("ranger", importance = "permutation")
# (console output) No value of `metric` was given; metric 'roc_auc' will be used.
workflow() %>%
add_recipe(ranger_recipe) %>%
add_model(imp_spec) %>%
fit(FS1_T0_Cluster_train) %>%
pull_workflow_fit() %>%
vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))

# Ranger package
# - Above tuned values used for ranger model.
# - Ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high-dimensional data.
tidy_ranger_model <- ranger(Cluster_ID ~ ., data = FS1_T0_Cluster_train[,-1] , importance = "permutation",
local.importance = TRUE,mtry = 10,num.trees = 1000,classification = TRUE,
min.node.size = 2)
tidy_ranger_model
Ranger result
Call:
ranger(Cluster_ID ~ ., data = FS1_T0_Cluster_train[, -1], importance = "permutation", local.importance = TRUE, mtry = 10, num.trees = 1000, classification = TRUE, min.node.size = 2)
Type: Classification
Number of trees: 1000
Sample size: 1306
Number of independent variables: 44
Mtry: 10
Target node size: 2
Variable importance mode: permutation
Splitrule: gini
OOB prediction error: 26.49 %
Train accuracy
confusionMatrix(tidy_ranger_model$confusion.matrix)
Confusion Matrix and Statistics
predicted
true 1 2 3
1 268 114 8
2 75 479 57
3 2 90 213
Overall Statistics
Accuracy : 0.7351
95% CI : (0.7102, 0.7588)
No Information Rate : 0.523
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5773
Mcnemar's Test P-Value : 0.0002662
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 0.7768 0.7013 0.7662
Specificity 0.8730 0.7881 0.9105
Pos Pred Value 0.6872 0.7840 0.6984
Neg Pred Value 0.9159 0.7065 0.9351
Prevalence 0.2642 0.5230 0.2129
Detection Rate 0.2052 0.3668 0.1631
Detection Prevalence 0.2986 0.4678 0.2335
Balanced Accuracy 0.8249 0.7447 0.8383
Test accuracy
tidy_ranger_pred.data <- predict(tidy_ranger_model, data = FS1_T0_Cluster_test)
table(FS1_T0_Cluster_test$Cluster_ID, tidy_ranger_pred.data$predictions)
1 2 3
1 86 44 1
2 30 161 13
3 0 43 59
tidy_ranger_cm<-confusionMatrix(table(FS1_T0_Cluster_test$Cluster_ID, tidy_ranger_pred.data$predictions, dnn = c("Reference", "Prediction")))
tidy_ranger_cm
Confusion Matrix and Statistics
Prediction
Reference 1 2 3
1 86 44 1
2 30 161 13
3 0 43 59
Overall Statistics
Accuracy : 0.7002
95% CI : (0.6549, 0.7428)
No Information Rate : 0.5675
P-Value [Acc > NIR] : 7.837e-09
Kappa : 0.5138
Mcnemar's Test P-Value : 0.000194
Statistics by Class:
Class: 1 Class: 2 Class: 3
Sensitivity 0.7414 0.6492 0.8082
Specificity 0.8598 0.7725 0.8819
Pos Pred Value 0.6565 0.7892 0.5784
Neg Pred Value 0.9020 0.6266 0.9582
Prevalence 0.2654 0.5675 0.1670
Detection Rate 0.1968 0.3684 0.1350
Detection Prevalence 0.2998 0.4668 0.2334
Balanced Accuracy 0.8006 0.7108 0.8450
tidy(tidy_ranger_cm)
Conclusion : Model accuracy
- Train Accuracy : 73.66%
- Test Accuracy : 70.02%
- Both are close by. Balanced model.
---
title: "Tidy models along with ranger"
output: html_notebook
---


```{r}

#--------------------------------------------------------#
#                   Libraries                            #
#--------------------------------------------------------#
# Top-level seed; later steps set their own seeds (123, 234, 68322)
# before each random operation, so this mainly documents intent.
set.seed(2020)
# NOTE(review): ranger and caret are called later in this document
# (ranger(), confusionMatrix()) but are commented out here — confirm
# they are attached elsewhere or namespace-qualify those calls.
#library(ranger)
#library(dplyr)
#library(caret)
library(tidymodels)  # rsample, recipes, parsnip, tune, workflows, ggplot2, ...
library(usemodels)   # use_ranger(): scaffolds tuning code for ranger models
```



```{r}
#--------------------------------------------------------#
#                    Data                                 #
#--------------------------------------------------------#
# Load the cluster-assignment table produced by the tuning pipeline.
# NOTE: hard-coded absolute Windows path.
FS1_T0_Cluster_Assignment <- read.csv(
  "D:\\DKE\\Thesis_related\\Implementation\\Hyper_tuning_results\\FS1_T0_Cluster_Assignment.csv"
)

# Classification models need a factor response, so convert the
# integer Cluster_ID column.
FS1_T0_Cluster_Assignment <- FS1_T0_Cluster_Assignment %>%
  mutate(Cluster_ID = as.factor(Cluster_ID))
```


```{r}
#--------------------------------------------------------#
#                  Splitting Data                        #
#--------------------------------------------------------#
# Stratified train/test split on the response so class proportions
# are preserved in both partitions.
set.seed(123)
FS1_T0_Cluster_split <- FS1_T0_Cluster_Assignment %>%
  initial_split(strata = Cluster_ID)
FS1_T0_Cluster_train <- training(FS1_T0_Cluster_split)
FS1_T0_Cluster_test <- testing(FS1_T0_Cluster_split)

# Stratified bootstrap resamples of the training data.
set.seed(234)
FS1_T0_Cluster_folds <- FS1_T0_Cluster_train %>%
  bootstraps(strata = Cluster_ID)
FS1_T0_Cluster_folds
```


```{r}

#--------------------------------------------------------#
# use_ranger : will give workflow pipeline details       #
#--------------------------------------------------------#
#library(usemodels)
# Prints (does not run) template code for a tuned ranger workflow:
# recipe, model spec, workflow, and a tune_grid() stub to fill in.
use_ranger(Cluster_ID ~ ., data = FS1_T0_Cluster_train)
```




```{r}

#--------------------------------------------------------#
#                  Start tuning                          #
#--------------------------------------------------------#
# Recipe: predict Cluster_ID from all remaining columns; Instance_ID
# is demoted to an "ID" role so it is carried through but never used
# as a predictor.
ranger_recipe <- 
  recipe(formula = Cluster_ID ~ ., data = FS1_T0_Cluster_train) %>%
  update_role(Instance_ID, new_role = "ID")

# Random forest: tune mtry and min_n; trees fixed at 1000.
ranger_spec <- 
  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_mode("classification") %>% 
  set_engine("ranger") 

ranger_workflow <- 
  workflow() %>% 
  add_recipe(ranger_recipe) %>% 
  add_model(ranger_spec) 

# FIX: the fold assignment was previously unseeded (irreproducible)
# and unstratified, unlike the stratified initial_split()/bootstraps()
# used elsewhere in this document. Seed it and stratify by the
# response. (The earlier FS1_T0_Cluster_folds bootstraps are unused
# here; v-fold CV is used for tuning instead.)
set.seed(68321)
ranger_folds <- vfold_cv(FS1_T0_Cluster_train, strata = Cluster_ID)

set.seed(68322)
ranger_tune <-
  tune_grid(ranger_workflow, resamples = ranger_folds, grid = 20)
```



```{r}
#--------------------------------------------------------#
#    Tuned results and various metrics                   #
#--------------------------------------------------------#
# One row per (mtry, min_n, metric): mean and standard error of each
# metric across the resamples.
ranger_tune %>% collect_metrics()
```


```{r}
#--------------------------------------------------------#
#            graphical results                           #
#--------------------------------------------------------#

# Mean cross-validated AUC for each candidate, one colored line per
# min_n level, to visualize how mtry/min_n affect performance.
ranger_tune %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  # FIX: `size` for lines is deprecated since ggplot2 3.4.0;
  # `linewidth` is the supported aesthetic.
  geom_line(alpha = 0.5, linewidth = 1.5) +
  geom_point() +
  labs(y = "AUC")
```


```{r}
# Narrow the search: a regular grid over promising mtry and min_n
# ranges, 5 levels each (25 candidate combinations in total).
rf_grid_ranger <- grid_regular(
  mtry(range = c(10, 30)),
  min_n(range = c(2, 8)),
  levels = 5
)

rf_grid_ranger

# Register a parallel backend so tune_grid() fits resamples in
# parallel.
doParallel::registerDoParallel()

# Re-tune over the refined grid (overwrites the coarse ranger_tune).
ranger_tune <- tune_grid(
  ranger_workflow,
  resamples = ranger_folds,
  grid = rf_grid_ranger
)
```


```{r}
#--------------------------------------------------------#
#       Show and select the best                         #
#--------------------------------------------------------#

# Top candidates ranked by each metric.
show_best(ranger_tune, metric = "roc_auc")
show_best(ranger_tune, metric = "accuracy")

# Single best (mtry, min_n) pair by AUC; autoplot() shows metric
# profiles over the grid.
select_best(ranger_tune,metric = "roc_auc")
autoplot(ranger_tune)

```

```{r}
#--------------------------------------------------------#
#            graphical results                           #
#--------------------------------------------------------#

# Same AUC-vs-mtry visualization as above, now over the refined
# regular grid.
ranger_tune %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  # FIX: `size` for lines is deprecated since ggplot2 3.4.0;
  # `linewidth` is the supported aesthetic.
  geom_line(alpha = 0.5, linewidth = 1.5) +
  geom_point() +
  labs(y = "AUC")
```
```{r}
#--------------------------------------------------------#
#       Show and select the best                         #
#--------------------------------------------------------#

# Repeat of the inspection above, now for the refined-grid results.
show_best(ranger_tune, metric = "roc_auc")
show_best(ranger_tune, metric = "accuracy")

select_best(ranger_tune,metric = "roc_auc")
autoplot(ranger_tune)
```


```{r}
#--------------------------------------------------------#
#       finalize the model and workflow                  #
#--------------------------------------------------------#

# FIX: name the selection metric explicitly. Previously select_best()
# was called without `metric` and warned "No value of `metric` was
# given; metric 'roc_auc' will be used." — being explicit silences
# the warning and documents the selection criterion.
final_rf <- ranger_workflow %>%
  finalize_workflow(select_best(ranger_tune, metric = "roc_auc"))

final_rf
```

```{r}

#--------------------------------------------------------#
# fit the best tuned parameter on the training data      #
#--------------------------------------------------------#
FS1_T0_Cluster_fit <- last_fit(final_rf, FS1_T0_Cluster_split)
FS1_T0_Cluster_fit

```


```{r}
# Test-set metrics from the last_fit() evaluation.
collect_metrics(FS1_T0_Cluster_fit)
```

```{r}
library(vip)

# FIX: pass the metric explicitly — select_best() previously warned
# "No value of `metric` was given; metric 'roc_auc' will be used."
# The engine is re-declared with permutation importance so variable
# importance can be computed from the fit.
imp_spec <- ranger_spec %>%
  finalize_model(select_best(ranger_tune, metric = "roc_auc")) %>%
  set_engine("ranger", importance = "permutation")

# FIX: pull_workflow_fit() is deprecated; extract_fit_parsnip() is
# the supported replacement (workflows >= 0.2.3).
workflow() %>%
  add_recipe(ranger_recipe) %>%
  add_model(imp_spec) %>%
  fit(FS1_T0_Cluster_train) %>%
  extract_fit_parsnip() %>%
  vip(aesthetics = list(alpha = 0.8, fill = "midnightblue"))
```


---
# Ranger package 
* Above tuned values used for ranger model. <br/>
* Ranger is a fast implementation of random forests (Breiman 2001) or recursive partitioning, particularly suited for high-dimensional data. <br/>

```{r}
# FIX: library(ranger) is commented out at the top of this document,
# so the bare ranger() call is not in scope — qualify it with its
# namespace. Fits a final forest with the tuned hyper-parameters
# (mtry = 10, min.node.size = 2, 1000 trees) with permutation
# importance (per-observation via local.importance).
# The first column is dropped — presumably Instance_ID, the non-
# predictor identifier; TODO confirm column order against the CSV.
# `classification = TRUE` is redundant when the response is a factor,
# but kept for explicitness.
tidy_ranger_model <- ranger::ranger(
  Cluster_ID ~ ., data = FS1_T0_Cluster_train[, -1],
  importance = "permutation", local.importance = TRUE,
  mtry = 10, num.trees = 1000, classification = TRUE,
  min.node.size = 2
)
```

```{r}
# Print the fitted forest summary (OOB prediction error, mtry, etc.).
tidy_ranger_model
```


# Train accuracy

```{r}
# FIX: library(caret) is commented out at the top of this document,
# so qualify the call. Summarizes ranger's out-of-bag confusion
# matrix (training-side accuracy, kappa, per-class statistics).
caret::confusionMatrix(tidy_ranger_model$confusion.matrix)
```

# Test accuracy
```{r}
# Predict the held-out test set and cross-tabulate truth (rows)
# against predictions (columns).
tidy_ranger_pred.data <- predict(tidy_ranger_model, data = FS1_T0_Cluster_test)
table(FS1_T0_Cluster_test$Cluster_ID, tidy_ranger_pred.data$predictions)

```

```{r}
# FIX: qualify confusionMatrix() — caret is never attached in this
# document. dnn names the table dimensions so the printout reads
# Reference (rows) x Prediction (columns).
tidy_ranger_cm <- caret::confusionMatrix(
  table(FS1_T0_Cluster_test$Cluster_ID, tidy_ranger_pred.data$predictions,
        dnn = c("Reference", "Prediction"))
)
tidy_ranger_cm
# tidy() flattens the confusion-matrix statistics into a tibble.
tidy(tidy_ranger_cm)
```


# Conclusion : Model accuracy

* Train Accuracy : 73.66% <br/>
* Test Accuracy : 70.02% <br/>
* The two are close, indicating a balanced (not overfit) model. <br/>